if(!require(haven)){install.packages("haven")}; library(haven)
if(!require(miceadds)){install.packages("miceadds")}; library(miceadds)
if(!require(glmmML)){install.packages("glmmML")}; library(glmmML)
if(!require(Amelia)){install.packages("Amelia")}; library(Amelia)
if(!require(lmtest)){install.packages("lmtest")}; library(lmtest)
if(!require(sandwich)){install.packages("sandwich")}; library(sandwich)
if(!require(tidyverse)){install.packages("tidyverse")}; library(tidyverse)
if(!require(MASS)){install.packages("MASS")}; library(MASS)
if(!require(mvtnorm)){install.packages("mvtnorm")}; library(mvtnorm)
if(!require(texreg)){install.packages("texreg")}; library(texreg)
if(!require(stargazer)){install.packages("stargazer")}; library(stargazer)
if(!require(xtable)){install.packages("xtable")}; library(xtable)


# set working directory

# import data 
data = read_dta("Data-file-for-table-replications.dta")

################################################################################################
# DATA CLEANING FOR REPLICATION OF ANDERSON ET AL. ORIGINAL RESULTS (TABLE 2 COLUMN 4) #########
################################################################################################

# create trend variable
trend = data$year - 1899

# create city indicator 
for (j in data$state_city_id) {
  new_col = paste("city_FE", j, sep = "_")
  data[,new_col] = as.numeric(data$state_city_id == j) 
} 

# create dataset of only city indicator 
indicators = data %>% dplyr::select(starts_with("city_FE"))

# create interaction terms of city indicator*trend
interactions = indicators*trend
colnames(interactions) = paste(colnames(interactions),"trend", sep = "_")

# put interactions back into dataset 
data2 = data %>% dplyr::select(-starts_with("city_FE"))
data3 = base::cbind(data2, interactions)

# dataset for table2 column 4 regression 
tbl2col4 =  data3 %>%
  mutate(year = as.factor(year)) %>%
  mutate(state_city_id = as.factor(state_city_id)) %>%
  dplyr::select(tb_lungs, ln_tb_lungs, sanatorium, hospital, openair, city_reporting_required, state_reporting_required, 
                city_disinfect_required, state_disinfect_required, anti_spit, city_common_cup_ban, state_common_cup_ban,
                city_tb_board, state_tb_board, dispensary,
                per_female, per_black, per_foreign, per_under18, per_literate, 
                state_city_id, year, population_imp, 
                state_fips, missing_demographics, ends_with("_missing"), starts_with("city_FE"))

rm(data2); rm(data3)

################################################################################################
# REPLICATE ORIGINAL RESULTS  OF OLS MODEL IN TABLE 2 COLUMN 4 IN ANDERSON ET AL. ##############
################################################################################################

cluster_linmod = lm.cluster(data = tbl2col4, 
                            ln_tb_lungs ~ . - population_imp - state_fips - tb_lungs,
                            cluster = tbl2col4$state_fips,
                            weights = tbl2col4$population_imp)

################################################################################################
# REPLICATE REGRESSION WITH CLASSICAL STANDARD ERRORS & INDICATOR METHOD FOR MISSING DATA ######
################################################################################################

classical_linmod = lm(data = tbl2col4, 
                      ln_tb_lungs ~ . - population_imp - state_fips - tb_lungs,
                      weights = tbl2col4$population_imp)

################################################################################################
## MULTIPLE IMPUTATION FOR MISSING DATA ########################################################
################################################################################################

# convert year to a numeric variable
data_amelia = tbl2col4 %>% mutate(year = as.numeric(tbl2col4$year))

# recode missing variables as missing
data_amelia = data_amelia %>%
  mutate(sanatorium = case_when(sanatorium_missing == 1 ~ NA_real_,
                                TRUE ~ sanatorium)) %>%
  mutate(openair = case_when(openair_missing == 1 ~ NA_real_,
                             TRUE ~ openair)) %>%
  mutate(hospital = case_when(hospital_missing == 1 ~ NA_real_,
                              TRUE ~ hospital)) %>%
  mutate(per_female = case_when(missing_demographics == 1 ~ NA_real_,
                                TRUE ~ per_female)) %>%
  mutate(per_black = case_when(missing_demographics == 1 ~ NA_real_,
                               TRUE ~ per_black)) %>%
  mutate(per_foreign = case_when(missing_demographics == 1 ~ NA_real_,
                                 TRUE ~ per_foreign)) %>%
  mutate(per_under18 = case_when(missing_demographics == 1 ~ NA_real_,
                                 TRUE ~ per_under18)) %>%
  mutate(per_literate = case_when(missing_demographics == 1 ~ NA_real_,
                                  TRUE ~ per_literate)) %>%
  mutate(city_tb_board = case_when(city_tb_board_missing == 1 ~ NA_real_,
                                   TRUE ~ city_tb_board)) %>%
  mutate(city_reporting_required = case_when(city_reporting_required_missing == 1 ~ NA_real_,
                                             TRUE ~ city_reporting_required)) %>%
  mutate(anti_spit = case_when(anti_spit_missing == 1 ~ NA_real_,
                               TRUE ~ anti_spit)) %>%
  mutate(city_common_cup_ban = case_when(city_common_cup_ban_missing == 1 ~ NA_real_,
                                         TRUE ~ city_common_cup_ban)) %>%
  mutate(city_disinfect_required = case_when(city_disinfect_required_missing == 1 ~ NA_real_,
                                             TRUE ~ city_disinfect_required)) %>%
  mutate(dispensary = case_when(dispensary_missing == 1 ~ NA_real_,
                                TRUE ~ dispensary))

# drop missing indicators
data_amelia = data_amelia %>%
  dplyr::select(-ends_with("_missing")) %>%
  dplyr::select(-"missing_demographics")

# for proportions with values equal to 0, recode as 0.00000001
data_amelia = data_amelia %>%
  mutate(per_black = case_when(per_black == 0 ~ 1e-10,
                               TRUE ~ per_black)) %>%
  mutate(per_foreign = case_when(per_foreign == 0 ~ 1e-10,
                                 TRUE ~ per_foreign))

# use Amelia to impute 
set.seed(02138)
data_imputed = amelia(data_amelia, 
                      m = 5, 
                      ts = "year", #ts is the variable identifying time in time series data
                      cs = "state_city_id", #cs is the variable indicating cross section variable
                      polytime = 1, #linear time trends  
                      intercs = FALSE, #no time interaction with cross section variable 
                      noms = c(3:6, 8, 10, 11, 13, 15), #these are the nominal variables (must be 0 or 1)  
                      lgstc = c("per_female", "per_black", "per_under18", "per_foreign", "per_literate"), #use logisic transformation for proportions (see Amelia documentation) 
                      sqrts = "population_imp", #use sqrt transformation for counts (see Amelia documentation)
                      idvars = c(25:574))

# reconstruct imputed datasets to include original NAs in outcome variables (i.e., remove imputed dependent variables)
data_analyze = list()
for(i in 1:data_imputed$m){
  data_analyze[[i]] = data_imputed$imputations[[i]] %>%
    mutate(tb_lungs = tbl2col4$tb_lungs) %>%
    mutate(ln_tb_lungs = tbl2col4$ln_tb_lungs)
}

# check if successful
for(i in 1:data_imputed$m){
  try(if(sum(is.na(tbl2col4$tb_lungs)) != sum(is.na(data_analyze[[i]]$tb_lungs)))
    stop(paste("Incorrect substitution:", i)))
  
  try(if(sum(is.na(tbl2col4$ln_tb_lungs)) != sum(is.na(data_analyze[[i]]$ln_tb_lungs)))
    stop(paste("Incorrect substitution", i)))
}

################################################################################################
# REPLICATE OLS REGRESSION WITH MULTIPLE IMPUTATION FOR MISSING DATA ###########################
################################################################################################

# re-run linear cluster, with missing data imputed
b.out    = NULL   # this will hold coefficients from each imputed dataset
se.out   = NULL   # this will hold standard errors from each imputed dataset
vcov.out = list() # this will hold vcov matrix from each imputed dataset
ols.out  = list() # this will hold the fitted model from each imputed dataset

for(i in 1:data_imputed$m){
  data_analyze[[i]] = data_analyze[[i]] %>%
    mutate(year = as.factor(year)) #change year to a factor for year fixed effects
  
  ols.out[[i]] = lm.cluster(data = data_analyze[[i]], #run regression with clustered ses on each imputed dataset
                             ln_tb_lungs ~ . - population_imp - state_fips - tb_lungs, 
                             cluster = data_analyze[[i]]$state_fips,
                             weights = data_analyze[[i]]$population_imp)
  
  # technically do not need these when using ols.out[[i]] and not using mi.meld
  b.out  = rbind(b.out, summary(ols.out[[i]])[,1]) #join data together 
  se.out = rbind(se.out, summary(ols.out[[i]])[,2])
  vcov.out[[i]] = ols.out$vcov
} # note: this for-loop takes a bit of time to run

# get coefficients and standard errors for all five imputations combined 
combined_imputations_lm = mi.meld(q = b.out, se = se.out, byrow = TRUE) #see Amelia documentation for mi.meld

# calculate degrees of freedom in model [should be same as df in classical se mod]
df = df.residual(classical_linmod)

# summarize results from imputed model 
summary_mi_lm = as_tibble(cbind(t(combined_imputations_lm$q.mi),
                                 t(combined_imputations_lm$se.mi))) %>%
  magrittr::set_colnames(c("estimate", "std.error")) %>%
  mutate(statistic = estimate / std.error,
         conf.low = estimate + std.error * qt(0.025, df),
         conf.high = estimate + std.error * qt (0.975, df),
         p.value = 2 * pt(abs(statistic), df, lower.tail = FALSE)) %>%
  magrittr::set_rownames(c(colnames(b.out)))


##########################################################################################################
## RUN NEW POISSON REGRESSION WITH MULTIPLE IMPUTATION FOR MISSING DATA & CLUSTERED STANDARD ERRORS ######
##########################################################################################################

b.out    = NULL    # this will hold coefficients from each imputed dataset
se.out   = NULL    # this will hold standard errors from each imputed dataset
pois.out  = list() # this will hold the fitted model from each imputed dataset

for(i in 1:data_imputed$m){
  data_analyze[[i]] = data_analyze[[i]] %>%
    mutate(year = as.factor(year)) #change year to a factor for year fixed effects
  
  pois.out[[i]] = glm.cluster(data = data_analyze[[i]], 
                               tb_lungs ~ . - population_imp - state_fips - ln_tb_lungs + offset(log(population_imp)), 
                               cluster = data_analyze[[i]]$state_fips,
                               family = "poisson"(link = "log"))
  
  b.out  = rbind(b.out, summary(pois.out[[i]])[,1]) #join data together 
  se.out = rbind(se.out, summary(pois.out[[i]])[,2])
} # note: this for-loop takes a bit of time to run

# get coefficients and standard errors for all five imputations combined 
combined_imputations_pois = mi.meld(q = b.out, se = se.out, byrow = TRUE) #see Amelia documentation for mi.meld

# summarize results from imputed model 
summary_mi_pois = as_tibble(cbind(t(combined_imputations_pois$q.mi),
                                   t(combined_imputations_pois$se.mi))) %>%
  magrittr::set_colnames(c("estimate", "std.error")) %>%
  mutate(statistic = estimate / std.error,
         conf.low = estimate + std.error * qt(0.025, df),
         conf.high = estimate + std.error * qt (0.975, df),
         p.value = 2 * pt(abs(statistic), df, lower.tail = FALSE)) %>%
  magrittr::set_rownames(c(colnames(b.out)))


##########################################################################################################
## RUN NEW POISSON REGRESSION WITH MULTIPLE IMPUTATION FOR MISSING DATA & CLASSICAL STANDARD ERRORS ######
##########################################################################################################

b.out    = NULL   # this will hold coefficients from each imputed dataset
se.out   = NULL   # this will hold standard errors from each imputed dataset
pois.classicse.out  = list() # this will hold the fitted model from each imputed dataset

for(i in 1:data_imputed$m){
  data_analyze[[i]] = data_analyze[[i]] %>%
    mutate(year = as.factor(year)) #change year to a factor for year fixed effects
  
  pois.classicse.out[[i]] = glm(data = data_analyze[[i]], 
                               tb_lungs ~ . - population_imp - state_fips - ln_tb_lungs + offset(log(population_imp)), 
                               family = "poisson"(link = "log"))
  
  b.out  = rbind(b.out, summary(pois.classicse.out[[i]])$coefficients[,1]) #join data together 
  se.out = rbind(se.out, summary(pois.classicse.out[[i]])$coefficients[,2])
} # note: this for-loop takes a bit of time to run

# get coefficients and standard errors for all five imputations combined 
combined_imputations_pois_classicse = mi.meld(q = b.out, se = se.out, byrow = TRUE) #see Amelia documentation for mi.meld

# summarize results from imputed model 
summary_mi_pois_classicse = as_tibble(cbind(t(combined_imputations_pois_classicse$q.mi),
                                   t(combined_imputations_pois_classicse$se.mi))) %>%
  magrittr::set_colnames(c("estimate", "std.error")) %>%
  mutate(statistic = estimate / std.error,
         conf.low = estimate + std.error * qt(0.025, df),
         conf.high = estimate + std.error * qt (0.975, df),
         p.value = 2 * pt(abs(statistic), df, lower.tail = FALSE)) %>%
  magrittr::set_rownames(c(colnames(b.out)))


##########################################################################################################
# WRITE FUNCTIONS FOR SIMULATION FROM REGRESSION MODELS ##################################################
##########################################################################################################

# list interventions for simulations
interventions = c("sanatorium",
                  "hospital",
                  "openair",
                  "city_reporting_required",
                  "state_reporting_required",
                  "city_disinfect_required",
                  "state_disinfect_required",
                  "anti_spit",
                  "city_common_cup_ban",
                  "state_common_cup_ban",
                  "city_tb_board",
                  "state_tb_board",
                  "dispensary")

### FD SIMULATION FROM THE ORIGINAL OLS MODEL 

# A function to calculate the first difference from the original linear model (no imputed data)
calc_firstdiff_lm_orig = function(intervention_list){
  ## intervention_list: factor, "sanatorium", "hospital","openair", or other created categories to evaluate
  
  # call the OLS regression model that replicates the original results
  fit = cluster_linmod
  
  # simulate beta from multivariate normal
  simulated_beta = rmvnorm(5000, mean = summary(fit)[,1], sigma = fit$vcov)
  
  # obtain covariates and take median of observed covariates
  big_X = model.matrix(fit$lm_res)
  big_X = big_X[,which(colnames(big_X) %in% colnames(simulated_beta))]
  big_X = as.matrix(t(apply(X = big_X, MARGIN = 2, FUN = median)))
  
  # simulate qoi
  sims = lapply(intervention_list,
                 function (i) {
                   
                   # set covariates to appropriate values for intervention of interest
                   X_interv   = big_X
                   X_nointerv = big_X
                   
                   X_interv[, i]   = 1
                   X_nointerv[, i] = 0
                   
                   # calculate expected log(TB mortality rate) (estimation uncertainty) for 5000 sims
                   lntb_mort_interv   = X_interv %*% t(simulated_beta)
                   lntb_mort_nointerv = X_nointerv %*% t(simulated_beta)
                   
                   # simulate from normal distribution (fundamental uncertainty)
                   # each row is one of 5000 simulations, each column is a draw from that simulation (1000 draws from each sim)
                   lnY_interv   = sapply(lntb_mort_interv, function (j) {rnorm(1000, mean = j, sd = sd(fit$lm_res$residuals))})
                   lnY_nointerv = sapply(lntb_mort_nointerv, function (j) {rnorm(1000, mean = j, sd = sd(fit$lm_res$residuals))})
                   
                   # exponentiate to transform from log(TB mortality) to TB mortality
                   Y_interv   = exp(lnY_interv)
                   Y_nointerv = exp(lnY_nointerv)
                   
                   # average to find expected TB mortality for each simulation
                   EY_interv   = colMeans(Y_interv)
                   EY_nointerv = colMeans(Y_nointerv)
                   
                   # QOI is the difference in expected mortality rate with the intervention and without the intervention
                   fd = EY_interv - EY_nointerv
                   
                   # summarize the 5000 simulations by quantiles
                   result_q = quantile(fd, prob = c(0.025, 0.5, 0.975))
                   tibble(intervention = i,
                          p025 = result_q[1],
                          median = result_q[2],
                          p975 = result_q[3],
                          mean = mean(fd)) -> result
                   return(result)
                 }) %>% bind_rows()
  return(sims)
}

### FD SIMULATION FROM THE OLS MODEL WITH IMPUTED DATA

# A function to calculate first differences from the ols regression models run on imputed datasets
merge_simfromimpute_lm = function(intervention_list, m , model){
  ## intervention_list: factor, "sanatorium", "hospital","openair", etc
  ## m: numeric, the number of imputed datasets data_imputed$m
  ## model: fitted regression model used for simulation
  
  fd_sims = NULL
  
  for (i in 1:m){
    
    # call a specific imputed dataset
    fit = model[[i]]
    
    # simulate beta from multivariate normal
    no_sim   = 1/m*5000
    simulated_beta = rmvnorm(no_sim, mean = summary(fit)[,1], sigma = fit$vcov)
    
    # obtain covariates
    big_X = model.matrix(fit$lm_res)
    big_X = big_X[,which(colnames(big_X) %in% colnames(simulated_beta))]
    big_X = as.matrix(t(apply(X = big_X, MARGIN = 2, FUN = median)))
    
    X_interv   = big_X
    X_nointerv = big_X
    
    X_interv[, intervention_list]   = 1
    X_nointerv[, intervention_list] = 0
    
    # calculate expected log(TB mortality rate) [estimation uncertainty]
    lntb_mort_interv   = X_interv %*% t(simulated_beta)
    lntb_mort_nointerv = X_nointerv %*% t(simulated_beta)
    
    # simulate log(TB mortality rate) from normal distribution [fundamental uncertainty] 
    lnY_interv   = sapply(lntb_mort_interv, function (j) {rnorm(1000, mean = j, sd = sd(fit$lm_res$residuals))})
    lnY_nointerv = sapply(lntb_mort_nointerv, function (j) {rnorm(1000, mean = j, sd = sd(fit$lm_res$residuals))})
    
    # exponentiate to transform from log(TB mortality) to TB mortality
    Y_interv   = exp(lnY_interv)
    Y_nointerv = exp(lnY_nointerv)
    
    # average to find expected TB mortality for each simulation
    EY_interv   = colMeans(Y_interv)
    EY_nointerv = colMeans(Y_nointerv)
    
    fd = EY_interv - EY_nointerv
    
    fd_sims = rbind(fd_sims, fd)
  }
  return(fd_sims)
}


calc_firstdiff_impute_lm = function(intervention_list, m, model){
  
  result = lapply(intervention_list, function (k) {
    
    fd_sims = merge_simfromimpute_lm(intervention_list = k, m = m, model = model)
    
    # Reformat the result
    result_q = quantile(fd_sims, prob = c(0.025, 0.5, 0.975))
    tibble(intervention = k,
           p025 = result_q[1],
           median = result_q[2],
           p975 = result_q[3],
           mean = mean(fd_sims)) -> result2
    return(result2)
  }) %>% bind_rows()
  return(result)
}


### FD SIMULATION FROM THE POISSON MODEL WITH IMPUTED DATA

# A function to calculate first differences from the poisson regression models run on imputed datasets
merge_simfromimpute_pois = function(intervention_list, m , model){
  ## intervention_list: factor, "sanatorium", "hospital","openair", etc
  ## m: numeric, the number of imputed datasets data_imputed$m
  ## model: fitted regression model used for simulation
  
  fd_sims = NULL
  
  for (i in 1:m){
    
    # call a specific imputed dataset
    fit = model[[i]]
    
    # simulate beta from multivariate normal
    no_sim   = 1/m*5000
    simulated_beta = rmvnorm(no_sim, mean = summary(fit)[,1], sigma = fit$vcov)
    
    # obtain covariates
    big_X = model.matrix(fit$glm_res)
    big_X = big_X[,which(colnames(big_X) %in% colnames(simulated_beta))]
    big_X = as.matrix(t(apply(X = big_X, MARGIN = 2, FUN = median)))
    
    X_interv   = big_X
    X_nointerv = big_X
    
    X_interv[, intervention_list]   = 1
    X_nointerv[, intervention_list] = 0
    
    # calculate expected TB mortality rate
    tb_mort_interv   = colMeans(exp(X_interv %*% t(simulated_beta)))
    tb_mort_nointerv = colMeans(exp(X_nointerv %*% t(simulated_beta)))
    fd = (tb_mort_interv - tb_mort_nointerv)*100000 # re-scale to per 100,000 populations
    
    fd_sims = rbind(fd_sims, fd)
  }
  return(fd_sims)
}


calc_firstdiff_impute_pois = function(intervention_list, m, model){
  
  result = lapply(intervention_list, function (k) {
    
    fd_sims = merge_simfromimpute_pois(intervention_list = k, m = m, model = model)
    
    # Reformat the result
    result_q = quantile(fd_sims, prob = c(0.025, 0.5, 0.975))
    tibble(intervention = k,
           p025 = result_q[1],
           median = result_q[2],
           p975 = result_q[3],
           mean = mean(fd_sims)) -> result2
    return(result2)
  }) %>% bind_rows()
  return(result)
}

####################################################################################################
# RUN FUNCTIONS TO CALCULATE QOI (FIRST DIFFERENCE EFFECT ON MORTALITY RATE) FOR EACH INTERVENTION # 
####################################################################################################

set.seed(02138)
qoi_origdata_lmmod  = calc_firstdiff_lm_orig(intervention_list = interventions)
qoi_midata_lmmod    = calc_firstdiff_impute_lm(intervention_list = interventions, m = 5, model = ols.out)
qoi_midata_poismod  = calc_firstdiff_impute_pois(intervention_list = interventions, m = 5, model = pois.out)


####################################################################################################
## CREATE TABLES AND FIGURES #######################################################################
####################################################################################################

## FIGURES
# rename interventions in datasets for figure labels
figlabel <- c("Sanatorium",
              "Hospital",
              "Open-air camp",
              "City reporting ordinance",
              "State reporting ordinance",
              "City disinfection ordinance",
              "State disinfection ordinance",
              "Spitting ban",
              "City common cup ordinance",
              "State common cup ordinance",
              "Municipal TB association",
              "State TB association",
              "Dispensary"
              )

qoi_origdata_lmmod[,1] = figlabel
qoi_midata_lmmod[,1]   = figlabel
qoi_midata_poismod[,1] = figlabel

# FIGURE 1: simulation results for original model with indicator variables for missing data & original model with MI

qoi_origdata_lmmod$Data = "Original"
qoi_midata_lmmod$Data   = "Multiple Imputation"

compare_orig_mi = full_join(qoi_origdata_lmmod, qoi_midata_lmmod)

ggplot(data = compare_orig_mi, aes(x = reorder(intervention, mean), y = mean, color = Data)) +
  geom_point(size = 3) +
  scale_color_manual(values=c("orange", "black")) +
  coord_flip()+
  geom_linerange(aes(ymin = p025, ymax = p975))+
  theme_classic()+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(colour="black", size = 14))+
  theme(axis.text.y = element_text(colour="black", size = 14))+
  ylab("First difference in TB-mortality rate per 100,000 population")+
  geom_hline(yintercept = 0, size = 0.2, linetype = "dashed")+
  xlab(element_blank())


# FIGURE 2: residual plot for OLS
linmod_res = resid(classical_linmod)
drop_miss  = tbl2col4 %>% filter(is.na(ln_tb_lungs) == FALSE)
linplot    = as.data.frame(cbind(linmod_res, ln_pop = log(drop_miss$population_imp)))

ggplot(data = linplot, aes(x = ln_pop, y = linmod_res)) +
  geom_point() +
  theme_classic() +
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(colour="black", size = 14))+
  theme(axis.text.y = element_text(colour="black", size = 14))+
  ylab("Residuals") +
  xlab("log(population)")+
  ylim(-2.5,2.5)


# FIGURE 3: residual plot for Poisson
poismod_res <- resid(pois.classicse.out[[1]])
poisplot <- as.data.frame(cbind(poismod_res, ln_pop = log(drop_miss$population_imp)))

ggplot(data = poisplot, aes(x = ln_pop, y = poismod_res)) +
  geom_point() +
  theme_classic() +
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(colour="black", size = 14))+
  theme(axis.text.y = element_text(colour="black", size = 14))+
  ylab("Residuals") +
  xlab("log(population)")+
  ylim(-10,10)


# Figure 4: simulation results for OLS and Poisson with MI for missing data
qoi_midata_lmmod$Model = "Linear"
qoi_midata_poismod$Model = "Poisson"

compare_lm_pois = full_join(qoi_midata_lmmod, qoi_midata_poismod)

ggplot(data = compare_lm_pois, aes(x = reorder(intervention, mean), y = mean, color = Model)) +
  geom_point(size = 3, alpha = 0.6) +
  scale_color_manual(values=c("blue", "red")) +
  coord_flip()+
  geom_linerange(aes(ymin = p025, ymax = p975), size = 0.8, alpha = 0.35)+
  theme_classic()+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(colour="black", size = 14))+
  theme(axis.text.y = element_text(colour="black", size = 14))+
  ylab("First difference in TB-mortality rate per 100,000 population")+
  geom_hline(yintercept = 0, size = 0.2, linetype = "dashed")+
  xlab(element_blank())

## TABLES

#Table 1: median values of covariates
medians = as.matrix(t(apply(X = model.matrix(cluster_linmod$lm_res), MARGIN = 2, FUN = median)))
medians = as.matrix(medians[,2:19])
medians = round(medians, digits = 2)
xtable(medians, "latex")

#Table 2: summary statistics with missing data removed
stargazer(data_amelia[,1:21], summary.stat = c("n", "mean", "sd"))

#Table 3: classical vs. clustered standard errors for OLS regression
texreg(l = list(classical_linmod, cluster_linmod),
       custom.model.names = c("Classical Standard Errors", "Clustered Standard Errors"),
       stars = c(0.05),
       custom.coef.map = list("sanatorium" = "Sanatorium",
                              "hospital" = "TB hospital",
                              "openair" = "Open-air camp",
                              "city_reporting_required" = "Reporting ordinance",
                              "state_reporting_required" = "State reporting law",
                              "city_disinfect_required" = "Disinfection ordinance",
                              "state_disinfect_required" = "State disinfection law",
                              "anti_spit" = "Spitting ordinance",
                              "city_common_cup_ban" = "Common cup ordinance",
                              "state_common_cup_ban" = "State common cup law",
                              "city_tb_board" = "Municipal TB association",
                              "state_tb_board" = "State TB association",
                              "dispensary" = "Dispensary"),
       include.rsquared = FALSE,
       include.adjrs = FALSE,
       digits = 3)

#Table 4: classical vs. clustered standard errors for Poisson regression
compare_poismods = cbind(
  t(combined_imputations_pois_classicse$q.mi)[1:14], t(combined_imputations_pois_classicse$se.mi)[1:14], 
  t(combined_imputations_pois$q.mi)[1:14],  t(combined_imputations_pois$se.mi)[1:14])

colnames(compare_poismods) = c("coef.clas_pois",    "se.clas_pois", 
                               "coef.clus_pois",    "se.clus_pois")

compare_poismods = round(compare_poismods, digits = 3)

